{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4cd1eaa8-3424-41ad-9cf2-3e8548712865",
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "\n",
    "import requests\n",
    "import docx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8180e7e4-b90d-4900-a59b-d22e5d6537c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_line(line):\n",
    "    line = line.strip()\n",
    "    line = line.strip('\\uFEFF')\n",
    "    return line\n",
    "\n",
    "def read_faq(file_id):\n",
    "    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'\n",
    "    \n",
    "    response = requests.get(url)\n",
    "    response.raise_for_status()\n",
    "    \n",
    "    with io.BytesIO(response.content) as f_in:\n",
    "        doc = docx.Document(f_in)\n",
    "\n",
    "    questions = []\n",
    "\n",
    "    question_heading_style = 'heading 2'\n",
    "    section_heading_style = 'heading 1'\n",
    "    \n",
    "    heading_id = ''\n",
    "    section_title = ''\n",
    "    question_title = ''\n",
    "    answer_text_so_far = ''\n",
    "     \n",
    "    for p in doc.paragraphs:\n",
    "        style = p.style.name.lower()\n",
    "        p_text = clean_line(p.text)\n",
    "    \n",
    "        if len(p_text) == 0:\n",
    "            continue\n",
    "    \n",
    "        if style == section_heading_style:\n",
    "            section_title = p_text\n",
    "            continue\n",
    "    \n",
    "        if style == question_heading_style:\n",
    "            answer_text_so_far = answer_text_so_far.strip()\n",
    "            if answer_text_so_far != '' and section_title != '' and question_title != '':\n",
    "                questions.append({\n",
    "                    'text': answer_text_so_far,\n",
    "                    'section': section_title,\n",
    "                    'question': question_title,\n",
    "                })\n",
    "                answer_text_so_far = ''\n",
    "    \n",
    "            question_title = p_text\n",
    "            continue\n",
    "        \n",
    "        answer_text_so_far += '\\n' + p_text\n",
    "    \n",
    "    answer_text_so_far = answer_text_so_far.strip()\n",
    "    if answer_text_so_far != '' and section_title != '' and question_title != '':\n",
    "        questions.append({\n",
    "            'text': answer_text_so_far,\n",
    "            'section': section_title,\n",
    "            'question': question_title,\n",
    "        })\n",
    "\n",
    "    return questions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7d3c2dd7-f64a-4dc7-a4e3-3e8aadfa720f",
   "metadata": {},
   "outputs": [],
   "source": [
    "faq_documents = {\n",
    "    'llm-zoomcamp': '1m2KexowAXTmexfC5rVTCSnaShvdUQ8Ag2IEiwBDHxN0',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f94efe26-05e8-4ae5-a0fa-0a8e16852816",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "llm-zoomcamp\n"
     ]
    }
   ],
   "source": [
    "documents = []\n",
    "\n",
    "for course, file_id in faq_documents.items():\n",
    "    print(course)\n",
    "    course_documents = read_faq(file_id)\n",
    "    documents.append({'course': course, 'documents': course_documents})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b21af5c-2f6d-49e7-92e9-ca229e2473b9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
